# Global knitr chunk options: show code in the report, suppress package
# startup messages and warnings in the rendered output.
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
We started by importing the metaphlan output, the species-level abundance matrix, and the sample metadata. The metaphlan table was transposed so that rows represent samples and columns represent taxa. We then removed a small set of unwanted samples (week 1, 12, 14, and 16 for mice B3, C3, and E2), and reformatted the remaining sample IDs to match the metadata convention.
In the metadata, we dropped backup samples with IDs containing “B3B”,
“C3B”, or “E2B”, and standardized labels so that “B3A”, “C3A”, and “E2A”
were replaced by “B3”, “C3”, and “E2” in both the sampleID
and Mouse_ID fields. Using these harmonized IDs, we kept
only samples present in both the microbiome table and the metadata and
constructed a combined dataset containing the Diet and Supplement
variables along with the microbial abundance profiles.
Next, we restricted the feature set to species-level taxa by keeping only columns whose names contained “|s_” and did not contain “|t_”. All microbial abundance columns were converted to numeric values. To reduce sparsity, we counted the number of zeros for each species across samples and retained only those species with zeros in fewer than 10% of samples. We then added a small pseudo-count, defined as one tenth of the smallest non-zero value in the matrix, to all zero entries.
Finally, we converted the abundance matrix to relative abundances by normalizing each row so that it sums to one, i.e., each entry represents the proportion of that species within a sample.
# Raw merged MetaPhlAn table: first column = clade_name, remaining columns
# = samples. Read with the default header = FALSE, so the header row is
# treated as data — NOTE(review): confirm the file layout matches this.
raw <- read.table("data/merged_metaphlan.txt")
# NOTE(review): `data` is read but never used in the visible code, and the
# name masks the base function utils::data().
data <- read.table("data/new_mat_prop.txt")
meta <- read.table(
"data/DMOF6_merged_metadata_ag.txt",
header = TRUE,
sep = "\t",
fill = TRUE,
quote = "",
comment.char = "")
#flip row and column
# Transpose so rows = samples and columns = taxa (clade names from column 1).
raw_flipped <- as.data.frame(t(raw[,-1]))
colnames(raw_flipped) <- raw[[1]]
# The "clade_name" column of the transposed table holds the sample IDs
# (the original header values); promote them to rownames.
rownames(raw_flipped) <- raw_flipped$clade_name
raw_flipped$clade_name <- NULL
# Remove weeks 1, 12, 14 and 16 for mice B3, C3 and E2 (unwanted samples).
rows_to_remove <- c(
"wk12_B3", "wk14_B3", "wk16_B3", "wk1_B3",
"wk12_C3", "wk14_C3", "wk16_C3", "wk1_C3",
"wk12_E2", "wk14_E2", "wk16_E2", "wk1_E2"
)
raw_flipped <- raw_flipped[ !(rownames(raw_flipped) %in% rows_to_remove), ]
#modify row names
# Rename "wk<N>_<mouse>" to the metadata convention "DMOF6_<mouse>_wk<N>".
filtered <- raw_flipped
rownames(filtered) <- sub("^wk(\\d+)_([A-Za-z0-9]+)$", "DMOF6_\\2_wk\\1", rownames(filtered))
library(dplyr)
library(stringr)
library(tibble)
# Harmonize metadata IDs: drop backup samples (IDs containing B3B/C3B/E2B)
# and rename the "A" replicates (B3A/C3A/E2A) to plain B3/C3/E2 in both the
# sampleID and Mouse_ID fields.
meta_clean <- meta %>%
filter(!str_detect(sampleID, "B3B|E2B|C3B")) %>% #delete 12 rows
mutate(
sampleID = str_replace_all(sampleID, c("B3A" = "B3", "C3A" = "C3", "E2A" = "E2")),
Mouse_ID = str_replace_all(Mouse_ID, c("B3A" = "B3", "C3A" = "C3", "E2A" = "E2"))
)
# Samples present in both the microbiome table and the metadata.
common_ids <- intersect(rownames(filtered), meta_clean$sampleID)
#length(common_ids)
#combine metadata and filtered
missing_ids <- setdiff(rownames(filtered), meta_clean$sampleID)
#missing_ids
filtered_sub <- filtered[common_ids, , drop = FALSE]
# Metadata aligned to the same sample order as filtered_sub.
meta_sub <- meta_clean %>%
filter(sampleID %in% common_ids) %>%
arrange(match(sampleID, common_ids)) %>%
column_to_rownames("sampleID")
combined <- cbind(meta_sub, filtered_sub) #combined data with all columns
# Diet and Supplement extracted in the same (common_ids) sample order so
# they line up row-for-row with filtered_sub.
diet_col <- meta_clean %>%
filter(sampleID %in% common_ids) %>%
arrange(match(sampleID, common_ids)) %>%
pull(Diet)
supp_col <- meta_clean %>%
filter(sampleID %in% common_ids) %>%
arrange(match(sampleID, common_ids)) %>%
pull(Supplement)
# cbind() with a data.frame argument returns a data.frame here; rownames
# (sample IDs) come from filtered_sub.
combined_diet <- cbind(
Diet = diet_col,
Supplement = supp_col,
filtered_sub
)
#combined data with Diet, Supplement
#filter out column according to names
# Keep the two metadata columns plus species-level taxa only: a species
# column contains "|s_" and does NOT contain "|t_" (strain level).
# The pipe must be escaped in the second pattern as well — a bare "t_"
# would also drop species whose names merely contain "t_" somewhere else.
cols_keep <- colnames(combined_diet) %in% c("Diet", "Supplement") | (
  grepl("\\|s_", colnames(combined_diet)) &      # must contain "|s_" (species level)
  !grepl("\\|t_", colnames(combined_diet))       # must not contain "|t_" (strain level)
)
combined_diet <- combined_diet[ , cols_keep]
#convert to numeric
combined_diet <- combined_diet %>%
  mutate(across(-c(Diet, Supplement), ~ as.numeric(as.character(.))))
# Sparsity filter: keep species that are zero in fewer than 10% of samples.
non_meta_cols <- setdiff(names(combined_diet), c("Diet", "Supplement"))
zero_counts <- colSums(combined_diet[ , non_meta_cols] == 0, na.rm = TRUE)
n_samples <- nrow(combined_diet)
thr <- 0.10 * n_samples
taxa_keep <- non_meta_cols[ zero_counts < thr ]
combined_diet <- combined_diet[ , c("Diet", "Supplement", taxa_keep) ]
# Pseudo-count: one tenth of the smallest non-zero value, added to zeros.
mat <- as.matrix(combined_diet[ , -c(1,2)])
pseudo <- min(mat[mat > 0]) / 10
mat[mat == 0] <- pseudo
#pseudo
# Row-normalize to relative abundances (each row sums to 1).
mat_prop <- sweep(mat, 1, rowSums(mat), "/")
# Use data.frame() rather than cbind(): cbind() of character vectors with a
# numeric matrix coerces the result to a character matrix, silently turning
# every abundance value into a string.
combined_diet <- data.frame(
  Diet = combined_diet$Diet,
  Supplement = combined_diet$Supplement,
  mat_prop,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
write.csv(
  combined_diet,
  file = "data/combined_diet.csv"
)
We first quantified within-sample (alpha) diversity using the Shannon index. For each sample, we computed Shannon diversity from the species-level relative abundance matrix and compared its distribution across weeks using boxplots. We also stratified the analysis by diet group (Lean vs High-Fat) to visualize how alpha diversity changes over time under different dietary conditions.
To assess between-sample (beta) diversity, we calculated Bray–Curtis dissimilarity on the same abundance matrix. We then performed principal coordinates analysis (PCoA) on the Bray–Curtis distance matrix and plotted the first two axes, with points colored by week and shaped by diet.
suppressMessages(library(vegan))
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
# Alpha diversity: Shannon index per sample, compared across weeks.
combined_diet <- as.data.frame(combined_diet)
combined_diet$SampleID <- rownames(combined_diet)
# Sample IDs end in "_wk<N>"; pull that suffix out as an ordered Week factor.
combined_diet$Week <- sub(".*_(wk\\d+)$", "\\1", combined_diet$SampleID)
combined_diet$Week <- factor(combined_diet$Week, levels = c("wk1", "wk12", "wk14", "wk16"))
abundance_cols <- setdiff(colnames(combined_diet), c("Diet", "Supplement", "SampleID", "Week"))
abundance_data <- combined_diet[, abundance_cols]
# Coerce to numeric: combined_diet may have been built via cbind() with
# character metadata columns, which turns the abundances into strings.
abundance_data <- as.data.frame(sapply(abundance_data, as.numeric))
combined_diet$Shannon <- diversity(abundance_data, index = "shannon")
ggplot(combined_diet, aes(x = Week, y = Shannon)) +
geom_boxplot(fill = "skyblue", outlier.shape = NA) +
geom_jitter(width = 0.2, size = 1.5, alpha = 0.5) +
labs(
title = "Shannon Diversity Index across Weeks",
x = "Week",
y = "Shannon Index"
) +
theme_minimal()
# Per-diet Shannon boxplots: one helper builds the shared plot skeleton so
# the Lean and High-Fat panels stay visually consistent.
shannon_by_week_plot <- function(dat, box_fill, panel_title) {
  ggplot(dat, aes(x = Week, y = Shannon)) +
    geom_boxplot(fill = box_fill, outlier.shape = NA) +
    geom_jitter(width = 0.2, alpha = 0.4, size = 1.5) +
    labs(title = panel_title, x = "Week", y = "Shannon Index") +
    theme_minimal()
}
combined_lean <- combined_diet %>% filter(Diet == "Lean")
combined_hf <- combined_diet %>% filter(Diet == "HighFat")
p1 <- shannon_by_week_plot(combined_lean, "lightgreen", "Lean Diet")
p2 <- shannon_by_week_plot(combined_hf, "lightyellow", "High-Fat Diet")
p1
p2
suppressMessages(library(vegan))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(tibble))
abundance_data <- as.data.frame(sapply(abundance_data, as.numeric))
# Calculate Bray-Curtis beta-diversity
dist_bray <- vegan::vegdist(abundance_data, method = "bray") # or vegdist(ab_hell, "bray")
# Classical PCoA (metric MDS) on the Bray-Curtis distances, first two axes.
pcoa_fit <- cmdscale(dist_bray, k = 2, eig = TRUE)
pcoa_df <- tibble(
SampleID = combined_diet$SampleID,
PC1 = pcoa_fit$points[, 1],
PC2 = pcoa_fit$points[, 2]
) %>%
left_join(combined_diet[, c("SampleID", "Diet", "Supplement", "Week")], by = "SampleID")
# Percent variance explained from the positive eigenvalues only.
eig_vals <- pcoa_fit$eig[pcoa_fit$eig > 0]
var_exp <- eig_vals / sum(eig_vals)
pc1_var <- if (length(var_exp) >= 1) round(100 * var_exp[1], 1) else NA
pc2_var <- if (length(var_exp) >= 2) round(100 * var_exp[2], 1) else NA
ggplot(pcoa_df, aes(x = PC1, y = PC2, color = Week, shape = Diet)) +
geom_point(size = 2, alpha = 0.9) +
labs(
title = paste0(
"PCoA on Bray–Curtis β-diversity",
if (!is.na(pc1_var) && !is.na(pc2_var))
paste0(" (PC1 ", pc1_var, "%, PC2 ", pc2_var, "%)")
),
x = "PCoA 1", y = "PCoA 2"
) +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 9))
To visualize overall structure, we generated a heatmap in which both samples and taxa were clustered, providing a global view of abundance patterns without any metadata overlays.
We then explored the appropriate number of clusters using hierarchical clustering on the taxa proportion matrix with Ward’s \(D^2\) linkage. We examined three standard diagnostics—the within-cluster sum of squares (elbow), average silhouette width, and the gap statistic—across a range of cluster numbers.
For the final heatmap, we clustered samples into two groups using hierarchical clustering with Ward’s D² method, by cutting the dendrogram at k=2. We added a top annotation bar to show three key variables: diet (Lean vs HighFat), supplement (Cellulose vs Oligofructose), and week (wk1, wk12, wk14, wk16).
# Re-load the saved relative-abundance table so this section runs stand-alone
# (read.csv's default check.names = TRUE mangles "|" in taxa names to ".").
combined_diet <- read.csv("data/combined_diet.csv",
header = TRUE,
row.names = 1,
stringsAsFactors = FALSE)
suppressMessages(library(tidyverse))
suppressMessages(library(ComplexHeatmap))
suppressMessages(library(factoextra))
suppressMessages(library(cluster))
suppressMessages(library(cowplot))
# Taxa-only numeric matrix (rows = samples, columns = taxa).
taxa_prop <- combined_diet %>%
select(-Diet, -Supplement) %>%
as.matrix()
#initial heatmap
# Blank labels: after t(), heatmap rows = taxa and columns = samples, so
# column_labels needs nrow(taxa_prop) entries and row_labels ncol().
suppressMessages(
taxa_prop %>%
t() %>%
Heatmap(name = "abundance",
cluster_rows = TRUE,
cluster_columns = TRUE,
column_labels = rep(" ", nrow(taxa_prop)),
row_labels = rep(" ", ncol(taxa_prop)),
row_names_gp = gpar(fontsize = 1.5),
column_title = "Taxa Abundance Heatmap before clustering" )
)
cat("Elbow")
## Elbow
# Elbow
# Diagnostics for choosing the number of clusters (hierarchical cut with
# Ward.D2 linkage): within-cluster sum of squares across candidate k.
fviz_nbclust(taxa_prop, hcut, method = "wss", hc_method = "ward.D2")
# Silhouette
cat("Silhouette")
## Silhouette
# Average silhouette width across candidate k.
fviz_nbclust(taxa_prop, hcut, method = "silhouette", hc_method = "ward.D2")
# Gap Statistic
cat("Gap Statistic")
## Gap Statistic
# Gap statistic with B = 50 bootstrap reference data sets.
set.seed(1)
gap_stat <- clusGap(taxa_prop, FUN = hcut, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
# Hierarchical clustering of samples on Bray-Curtis distance, cut at k = 2.
bc_dist <- vegdist(taxa_prop, method = "bray")
clust.res <- hclust(bc_dist, method = "ward.D2")
k <- 2
my.clusts <- cutree(clust.res, k = k)
# cluster color
# NOTE(review): my.clusts/cluster_col are not used by the heatmap below,
# which re-clusters internally with the same distance and linkage.
cluster_col <- setNames(c("orange", "green4"), c("1", "2"))
samples <- rownames(taxa_prop)
## Diet
# Diet annotation vector, ordered like the heatmap samples.
diet_vec <- factor(
combined_diet[samples, "Diet"],
levels = c("Lean", "HighFat")
)
diet_col <- c(Lean = "#4f8ad1",
HighFat = "#d9534f")
# Map a single sample ID to its week label ("wk1", "wk12", "wk14", "wk16"),
# or NA when no week token is found. The "_wk1$" pattern is anchored so
# that wk12/wk14/wk16 IDs are not mis-read as week 1.
get_week <- function(x) {
  if (grepl("_wk1$", x, ignore.case = TRUE)) {
    return("wk1")
  }
  for (wk in c("wk12", "wk14", "wk16")) {
    if (grepl(wk, x, ignore.case = TRUE)) {
      return(wk)
    }
  }
  NA_character_
}
# Week annotation vector in heatmap sample order.
week_vec <- factor(
vapply(samples, get_week, character(1)),
levels = c("wk1", "wk12", "wk14", "wk16")
)
week_col <- c(wk1 = "#0080ff",
wk12 = "#ffcf00",
wk14 = "#ff66b2",
wk16 = "#00a651")
#Supplement
supp_vec <- factor(
combined_diet[samples, "Supplement"],
levels = c("Cellulose", "Oligofructose")
)
supp_col <- c(Cellulose = "#8da0cb",
Oligofructose = "#fc8d62")
#annotation
# Top annotation bars (supplement, diet, week) for the clustered heatmap.
top_anno <- HeatmapAnnotation(
supp = supp_vec,
diet = diet_vec,
week = week_vec,
col = list(
diet = diet_col,
week = week_col,
supp = supp_col
),
annotation_name_side = "left",
annotation_height = unit(c(4, 4, 4), "mm")
)
# Final heatmap: columns (samples) are clustered with Ward.D2 on Bray-Curtis
# distances; rows (taxa) use ComplexHeatmap's default clustering.
suppressMessages(
ht <- taxa_prop %>%
t() %>%
Heatmap(
name = "abundance",
clustering_method_columns = "ward.D2",
clustering_distance_columns = function(M) vegdist(M, "bray"),
column_labels = rep(" ", nrow(taxa_prop)),
row_labels = rep(" ", ncol(taxa_prop)),
top_annotation = top_anno,
column_title = "Taxa Abundance Heatmap Hierarchical clustering"
)
)
draw(ht)
# NOTE(review): hard-coded user-specific output path; prefer a
# project-relative path (e.g. "data/heatmap.png") for portability.
png("~/Desktop/heatmap.png", width = 2400, height = 1800, res = 300)
draw(ht)
invisible(dev.off())
We applied the centered log-ratio (CLR) transformation to the species-level proportion matrix.
suppressMessages(library(tidyverse))
suppressMessages(library(compositions))
# Re-load the saved relative-abundance table (rownames = sample IDs) so this
# section runs stand-alone.
combined_diet <- read.csv(
  "data/combined_diet.csv",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
diet_col <- combined_diet$Diet
supp_col <- combined_diet$Supplement
# Taxa-only matrix for the CLR transform.
taxa_mat <- combined_diet %>%
  select(-Diet, -Supplement) %>%
  as.matrix()
# Centered log-ratio transform (compositions::clr); rows still index samples.
clr_mat <- clr(taxa_mat)
combined_clr <- data.frame(
  Diet = diet_col,
  Supplement = supp_col,
  clr_mat,
  check.names = FALSE
)
# (Removed an unused `row_sums <- rowSums(clr_mat)` — the value was never
# read, and CLR rows sum to ~0 by construction anyway.)
# Group = Diet+Supplement combination, placed next to the metadata columns.
combined_clr <- combined_clr %>%
  mutate(Group = interaction(Diet, Supplement, sep = "+", drop = TRUE)) %>%
  relocate(Group, .after = Supplement)
write.csv(combined_clr,
  "data/combined_clr.csv",
  row.names = TRUE)
To find taxa that best separate Lean and High-Fat diets, we fit LASSO logistic regression models using the CLR-transformed abundance matrix. Diet (HighFat vs Lean) was the binary outcome, and species-level CLR values were the predictors.
First, we fit separate LASSO models at each week and across all weeks. For each time point, we used 10-fold cross-validation with the 1-SE rule to choose the penalty \(\lambda\) and then recorded the taxa with non-zero coefficients. Positive coefficients correspond to taxa enriched in the High-Fat group, and negative coefficients correspond to taxa enriched in the Lean group. For these selected taxa, we made bar plots of the coefficients and violin plots of CLR values by diet.
## ## LASSO by Week
## ### wk1
##
## **Number of selected taxa:** 7
## ### wk12
##
## **Number of selected taxa:** 8
## ### wk14
##
## **Number of selected taxa:** 9
## ### wk16
##
## **Number of selected taxa:** 6
cat("Lasso across all weeks")
## Lasso across all weeks
suppressMessages(library(tidyverse))
suppressMessages(library(glmnet))
set.seed(1)
# Binary outcome: 1 = HighFat, 0 = Lean.
y <- ifelse(combined_clr$Diet == "HighFat", 1, 0) # 0/1 outcome
# Predictor matrix: drop whichever metadata columns are present.
# any_of() is required here: combined_clr as built above carries only
# Diet/Supplement/Group, so a strict select(-SampleID, -Batch, ...) would
# error on the missing columns.
X <- combined_clr %>%
  select(-any_of(c("SampleID", "Batch", "MouseID", "Week",
                   "Diet", "Supplement", "Group"))) %>%
  as.matrix()
# 10-fold CV to choose the penalty; lambda.1se applies the 1-SE rule
# (sparsest model within one standard error of the CV minimum).
cvfit <- cv.glmnet(
  x = X,
  y = y,
  family = "binomial",
  alpha = 1,
  nfolds = 10,
  standardize = TRUE
)
lambda_opt <- cvfit$lambda.1se
# Diet ~ CLR-transformed abundance, refit at the chosen penalty.
fit <- glmnet(
  x = X,
  y = y,
  family = "binomial",
  alpha = 1,
  lambda = lambda_opt,
  standardize = TRUE
)
coef_vec <- coef(fit)
# Drop the intercept row; keep taxa with non-zero coefficients.
sel_idx <- which(coef_vec[-1, 1] != 0)
selected_taxa <- rownames(coef_vec)[-1][sel_idx]
lasso_tbl <- tibble(
  Taxon = selected_taxa,
  Coefficient = as.numeric(coef_vec[-1, 1][sel_idx]),
  # Positive coefficient = enriched in HighFat, negative = enriched in Lean.
  Direction = ifelse(Coefficient > 0, "HighFat↑", "Lean↑")
) %>% arrange(desc(abs(Coefficient)))
#print(lasso_tbl)
write.csv(lasso_tbl,
  "data/lasso_selected_taxa.csv",
  row.names = FALSE)
#cat("Number of selected taxa:", nrow(lasso_tbl), "\n")
Number of significant taxa: 16
coef_vec <- coef(fit)
# Full coefficient table (zeros included), minus the intercept.
# NOTE(review): coef_df is only inspected via the commented-out line below;
# the plot uses lasso_tbl.
coef_df <- data.frame(
Taxon = rownames(coef_vec),
Coefficient = as.numeric(coef_vec[, 1])
) %>%
filter(Taxon != "(Intercept)")
#coef_df
library(stringr)
library(ggplot2)
# Shorten taxon labels to the text after the last "." (species token).
lasso_tbl <- lasso_tbl %>%
mutate(
ShortTaxon = str_extract(Taxon, "[^\\.]+$")
)
# Bar plot of selected coefficients, sorted; color encodes enrichment side.
ggplot(lasso_tbl, aes(x = reorder(ShortTaxon, Coefficient), y = Coefficient, fill = Direction)) +
geom_col() +
coord_flip() +
labs(
title = "Lasso-Selected Taxa",
x = "Taxon",
y = "Lasso Coefficient"
) +
scale_fill_manual(values = c("HighFat↑" = "tomato", "Lean↑" = "skyblue")) +
theme_minimal(base_size = 12)
library(ggpubr)
selected_taxa <- lasso_tbl$Taxon
# Long format: one row per (sample, taxon) for the faceted violin plots.
# NOTE(review): abundances come from combined_diet (relative abundances),
# not the CLR matrix the LASSO was fit on — assumes the taxa column names
# match between the two tables; confirm.
long_df <- combined_diet %>%
select(Diet, all_of(selected_taxa)) %>%
pivot_longer(
cols = -Diet,
names_to = "Taxon",
values_to = "Abundance"
)
# Violin + jitter per taxon, with Wilcoxon significance stars per facet.
ggplot(long_df, aes(x = Diet, y = Abundance, fill = Diet)) +
geom_violin(trim = FALSE, alpha = 0.4, scale = "width") +
geom_jitter(width = 0.2, size = 1.2, alpha = 0.4, color = "black") +
stat_compare_means(method = "wilcox.test", label = "p.signif") +
facet_wrap(~ Taxon, scales = "free_y", ncol = 4) +
labs(
title = "Abundance of Lasso-Selected Taxa by Diet",
x = "Diet",
y = "Abundance"
) +
theme_minimal(base_size = 11) +
theme(
strip.text = element_text(size = 9, face = "bold"),
legend.position = "top"
)
ggsave("lasso_violin_plot.png", width = 16, height = 16, dpi = 300)
#selected_taxa
We evaluated how well the microbiome profile can predict diet. For each week (wk1, wk12, wk14, wk16), we restricted to samples where both diet groups were present and removed taxa with zero variance. We then repeatedly split the data into 80% training and 20% test sets (50 repeats). In each repeat, similar to the previous step, we used cross-validated LASSO (with the 1-SE rule for \(\lambda\)) on the training set and recorded the misclassification error on both training and test sets. The summary table shows, for each week, the number of Lean and High-Fat samples, along with the mean and standard deviation of the train and test errors across the 50 repeats. We applied the same procedure to all weeks combined. The null model error (random guessing) is 0.5 since the outcome is binary. Since the observed test error is lower than this benchmark, it suggests that the model provides meaningful predictive signal for distinguishing Lean from High-Fat diets.
suppressMessages(library(tidyverse))
suppressMessages(library(glmnet))
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))
# Per-week prediction of Diet from CLR profiles:
# 50 repeats of 80/20 train/test splits with CV-tuned LASSO (1-SE rule).
set.seed(1)
weeks <- c("wk1", "wk12", "wk14", "wk16")
# Predictor columns = everything that is not a metadata column.
taxa_cols <- setdiff(
colnames(combined_clr),
c("SampleID","Batch","MouseID","Week","Diet","Supplement","Group")
)
n_repeats <- 50
results_list <- list()
for (wk in weeks) {
  df_w <- combined_clr %>% dplyr::filter(.data$Week == wk)
  n_samples <- nrow(df_w)
  diet_tab <- table(df_w$Diet)
  lean_n <- unname(ifelse("Lean" %in% names(diet_tab), diet_tab["Lean"], 0))
  hf_n <- unname(ifelse("HighFat" %in% names(diet_tab), diet_tab["HighFat"], 0))
  # Placeholder row for weeks that cannot be modeled. Its column names must
  # match the success-path tibble below — the original used Train_Error_Mean
  # style names, which would make bind_rows() create duplicate, half-NA
  # error columns whenever a week was skipped.
  na_row <- tibble(
    Week = wk, N = n_samples, Lean = as.integer(lean_n), HighFat = as.integer(hf_n),
    `Train Error (mean)` = NA_real_, `Train Error (sd)` = NA_real_,
    `Test Error (mean)` = NA_real_, `Test Error (sd)` = NA_real_
  )
  # Skip weeks that are too small or contain only one diet group.
  if (n_samples < 10 || length(unique(df_w$Diet)) < 2) {
    results_list[[wk]] <- na_row
    next
  }
  y <- ifelse(df_w$Diet == "HighFat", 1, 0)
  X <- as.matrix(df_w[, taxa_cols, drop = FALSE])
  # Drop taxa with zero variance within this week.
  nzv <- apply(X, 2, sd, na.rm = TRUE) > 0
  X <- X[, nzv, drop = FALSE]
  if (ncol(X) == 0) {
    results_list[[wk]] <- na_row
    next
  }
  train_errs <- numeric(n_repeats)
  test_errs <- numeric(n_repeats)
  for (r in seq_len(n_repeats)) {
    set.seed(r)  # reproducible split for each repeat
    n <- nrow(X)
    tr_idx <- sample(seq_len(n), size = floor(0.8 * n))
    X_tr <- X[tr_idx, , drop = FALSE]
    y_tr <- y[tr_idx]
    X_te <- X[-tr_idx, , drop = FALSE]
    y_te <- y[-tr_idx]
    cvfit <- cv.glmnet(
      x = X_tr, y = y_tr,
      family = "binomial", alpha = 1,
      nfolds = 10, standardize = TRUE
    )
    # Refit at the 1-SE lambda; classify at the 0.5 probability threshold.
    fit <- glmnet(
      x = X_tr, y = y_tr,
      family = "binomial", alpha = 1,
      lambda = cvfit$lambda.1se, standardize = TRUE
    )
    pred_tr <- ifelse(predict(fit, X_tr, type = "response") > 0.5, 1, 0)
    pred_te <- ifelse(predict(fit, X_te, type = "response") > 0.5, 1, 0)
    train_errs[r] <- mean(pred_tr != y_tr)
    test_errs[r] <- mean(pred_te != y_te)
  }
  results_list[[wk]] <- tibble(
    Week = wk,
    N = n_samples,
    Lean = as.integer(lean_n),
    HighFat = as.integer(hf_n),
    `Train Error (mean)` = mean(train_errs),
    `Train Error (sd)` = sd(train_errs),
    `Test Error (mean)` = mean(test_errs),
    `Test Error (sd)` = sd(test_errs)
  )
}
# Assemble per-week results into one table, order weeks chronologically,
# round numeric columns to 3 decimals, and render as a styled HTML table.
results_tbl <- bind_rows(results_list) |>
  dplyr::mutate(Week = factor(Week, levels = c("wk1","wk12","wk14","wk16"))) |>
  dplyr::arrange(Week) |>
  dplyr::mutate(dplyr::across(where(is.numeric), \(v) round(v, 3)))
knitr::kable(
  results_tbl,
  format = "html",
  caption = "Predictive modeling by week (LASSO, 50 repeats of 80/20 splits)",
  align = "lrrrrrrrr"
) |>
  kableExtra::kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
| Week | N | Lean | HighFat | Train Error (mean) | Train Error (sd) | Test Error (mean) | Test Error (sd) |
|---|---|---|---|---|---|---|---|
| wk1 | 26 | 14 | 12 | 0.000 | 0.000 | 0.003 | 0.024 |
| wk12 | 29 | 14 | 15 | 0.001 | 0.006 | 0.033 | 0.095 |
| wk14 | 29 | 14 | 15 | 0.001 | 0.006 | 0.093 | 0.162 |
| wk16 | 28 | 14 | 14 | 0.032 | 0.036 | 0.133 | 0.178 |
suppressMessages(library(glmnet))
suppressMessages(library(tidyverse))
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))
# All-weeks predictive check: 50 repeats of an 80/20 train/test split,
# re-tuning the LASSO penalty by 10-fold CV (1-SE rule) on each training set.
# NOTE(review): X and y are reused from the all-weeks LASSO chunk above —
# this chunk assumes that chunk has already run.
set.seed(1)
n_repeats <- 50
train_error_vec <- numeric(n_repeats)
test_error_vec <- numeric(n_repeats)
for (r in seq_len(n_repeats)) {
  set.seed(r)  # reproducible split for each repeat
  n <- nrow(X)
  # floor() makes the train-set size an explicit integer rather than relying
  # on sample() silently truncating 0.8 * n; seq_len() is safe for n = 0.
  train_idx <- sample(seq_len(n), size = floor(0.8 * n))
  X_train <- X[train_idx, , drop = FALSE]
  y_train <- y[train_idx]
  X_test <- X[-train_idx, , drop = FALSE]
  y_test <- y[-train_idx]
  cvfit_train <- cv.glmnet(
    x = X_train,
    y = y_train,
    family = "binomial",
    alpha = 1,
    nfolds = 10,
    standardize = TRUE
  )
  lambda_opt <- cvfit_train$lambda.1se
  fit_train <- glmnet(
    x = X_train,
    y = y_train,
    family = "binomial",
    alpha = 1,
    lambda = lambda_opt,
    standardize = TRUE
  )
  yhat_train <- predict(fit_train, X_train, type = "response")
  yhat_test <- predict(fit_train, X_test, type = "response")
  # Classify at the 0.5 probability threshold.
  pred_train <- ifelse(yhat_train > 0.5, 1, 0)
  pred_test <- ifelse(yhat_test > 0.5, 1, 0)
  train_error_vec[r] <- mean(pred_train != y_train)
  test_error_vec[r] <- mean(pred_test != y_test)
}
results_tbl <- tibble(
  # Report the actual sample count instead of a hard-coded 112.
  N = nrow(X),
  `Train Error (mean)` = mean(train_error_vec),
  `Train Error (sd)` = sd(train_error_vec),
  `Test Error (mean)` = mean(test_error_vec),
  `Test Error (sd)` = sd(test_error_vec),
  `Null Model Error` = 0.5  # random-guessing benchmark for a binary outcome
) |>
  mutate(across(where(is.numeric), ~ round(., 3)))
knitr::kable(
  results_tbl,
  format = "html",
  caption = "Predictive modeling across all weeks(LASSO, 50 repeats of 80/20 splits)",
  align = "rrrrrr"
) |>
  kableExtra::kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
| N | Train Error (mean) | Train Error (sd) | Test Error (mean) | Test Error (sd) | Null Model Error |
|---|---|---|---|---|---|
| 112 | 0.032 | 0.036 | 0.133 | 0.178 | 0.5 |
To explore how taxa vary together, we constructed co-occurrence networks using the CLR-transformed abundance data. For each week, we took the taxa selected by the LASSO models and, one taxon at a time, regressed it on the others using LASSO. The penalty was chosen by minimizing an extended BIC (EBIC). We repeated the same neighborhood-selection procedure within each Diet + Supplement group, and once again using all samples combined. In the network plots, nodes represent taxa and edges indicate nonzero conditional associations. Edge width reflects the strength of the association, and edge color shows its sign (blue for positive, red for negative).
suppressMessages(library(igraph))
suppressMessages(library(stringr))
suppressMessages(library(dplyr))
# Guards: these objects must exist from earlier chunks.
# NOTE(review): lasso_selected_by_week (per-week LASSO selections) is not
# defined anywhere in this file as shown — confirm it is created in the
# per-week LASSO chunk whose rendered output appears above.
stopifnot(exists("combined_clr"), exists("weeks"))
stopifnot(exists("lasso_selected_by_week"))
# Linearly map x onto the interval `to` (defaults tuned for edge widths).
# Degenerate input (all-equal values, empty, or non-finite range) maps every
# element to the midpoint of `to` so plotting never sees NaN widths.
rescale01 <- function(x, to = c(0.6, 3.8)) {
  lo <- min(x, na.rm = TRUE)
  hi <- max(x, na.rm = TRUE)
  span <- hi - lo
  if (!is.finite(lo) || !is.finite(hi) || span == 0) {
    return(rep(mean(to), length(x)))
  }
  (x - lo) / span * (to[2] - to[1]) + to[1]
}
# EBIC
# Extended BIC for a Gaussian LASSO fit: ordinary BIC plus an extra
# 2*gamma*df*log(p) term penalizing the size p of the candidate set.
ebic <- function(rss, df, n, p, gamma = 0.5) {
  bic <- n * log(rss / n) + df * log(n)
  bic + 2 * gamma * df * log(p)
}
gamma <- 0.5  # EBIC predictor-set penalty weight
tau <- 0      # minimum |coefficient| for an edge to be drawn
# Per-week co-occurrence networks over the week-specific LASSO-selected taxa.
# Neighborhood selection: for each taxon i, LASSO-regress CLR(i) on the other
# selected taxa, pick lambda on the path by EBIC, and record coefficients.
for (wk in weeks) {
cat("\n### Week:", wk, "\n")
sel_names <- lasso_selected_by_week[[wk]]
if (is.null(sel_names) || length(sel_names) <= 1) {
cat("No significant taxa in this week; skipped.\n")
next
}
df_w <- combined_clr %>% dplyr::filter(.data$Week == wk)
sel_names <- intersect(sel_names, colnames(df_w))
if (length(sel_names) <= 1) {
cat("Selected taxa not present in this week; skipped.\n")
next
}
X <- as.matrix(df_w[, sel_names, drop = FALSE])
n <- nrow(X); p <- ncol(X)
colnames(X) <- sel_names
set.seed(1)
# B[i, j] = coefficient of taxon j in the regression for taxon i.
B <- matrix(0, p, p, dimnames = list(colnames(X), colnames(X)))
for (i in seq_len(p)) {
y <- X[, i]
Xo <- X[, -i, drop = FALSE]
fit <- glmnet::glmnet(x = Xo, y = y, alpha = 1, family = "gaussian", standardize = TRUE)
Yhat <- predict(fit, Xo)
# RSS along the whole lambda path; EBIC picks the path index.
rss <- colSums((y - Yhat)^2)
k <- which.min(ebic(rss, fit$df, n, p - 1, gamma = gamma))
co <- as.numeric(coef(fit, s = fit$lambda[k]))[-1]
B[i, -i] <- co
}
# Symmetrize the neighborhood coefficients; W[i, j] is the edge weight.
W <- (B + t(B)) / 2
diag(W) <- 0
adj <- (W != 0) * 1
g <- igraph::graph_from_adjacency_matrix(adj, mode = "undirected", diag = FALSE)
# Edge weights from W, in the graph's edge order.
get_w <- function(W, g) {
el <- igraph::as_edgelist(g, names = FALSE)
apply(el, 1, function(ix) W[ix[1], ix[2]])
}
w <- get_w(W, g)
keep_e <- which(abs(w) >= tau)
if (length(keep_e) == 0) {
cat(sprintf("No edges with |coef| >= %.2f; skipped.\n", tau))
next
}
g_tau <- igraph::subgraph.edges(g, eids = keep_e, delete.vertices = FALSE)
w_tau <- get_w(W, g_tau)
# Circular layout; theta drives label rotation around the circle.
coords <- igraph::layout_in_circle(g_tau)
theta <- atan2(coords[,2], coords[,1])
# Short labels: species token after ".s__", else genus after ".g__",
# else the last "_"-separated chunk of the name.
short_names <- sapply(igraph::V(g_tau)$name, function(x){
sp <- stringr::str_extract(x, "(?<=\\.s__)[^\\._]+")
if (!is.na(sp) && nzchar(sp)) return(sp)
ge <- stringr::str_extract(x, "(?<=\\.g__)[^\\._]+")
if (!is.na(ge) && nzchar(ge)) return(ge)
tail(strsplit(x, "_", fixed = TRUE)[[1]], 1)
})
# Flip labels on the left half of the circle so text reads upright.
# NOTE(review): atan2 yields angles in (-180, 180], so the (>90 & <270)
# test never flips angles in (-180, -90) — confirm this is intended.
ang_deg_raw <- theta * 180 / pi
flip <- (ang_deg_raw > 90 & ang_deg_raw < 270)
ang_deg <- ifelse(flip, ang_deg_raw - 180, ang_deg_raw)
hjust <- ifelse(flip, 1, 0)
#plot
# Edge width = |coefficient| (rescaled); color = sign (blue +, red -).
e_width <- rescale01(abs(w_tau), to = c(0.6, 4.0))
e_color <- ifelse(w_tau >= 0, "steelblue", "firebrick")
e_curve <- 0.15
node_cols <- grDevices::rainbow(igraph::vcount(g_tau), s = 0.9, v = 0.9)
enlarge <- 1.30
lab_push <- 1.08
label_cex <- 0.55
layout_scaled <- coords * enlarge
lab_xy_scaled <- cbind(coords[,1]*enlarge*lab_push, coords[,2]*enlarge*lab_push)
lim <- 1.02 * enlarge * lab_push
op <- par(mar = c(3,3,5,3), xpd = NA, xaxs = "i", yaxs = "i")
plot(
g_tau,
layout = layout_scaled,
vertex.size = 4.2,
vertex.label = NA,
vertex.color = node_cols,
vertex.frame.color = NA,
edge.width = e_width,
edge.color = e_color,
edge.curved = e_curve,
rescale = FALSE,
xlim = c(-lim, lim), ylim = c(-lim, lim),
asp = 1
)
title(sprintf("Week %s Taxa Co-occurrence |coef| ≥ %.2f", wk, tau), line = 3)
# Rotated vertex labels placed just outside the circle.
for (i in seq_len(igraph::vcount(g_tau))) {
text(lab_xy_scaled[i,1], lab_xy_scaled[i,2], short_names[i],
srt = ang_deg[i], adj = hjust[i],
cex = label_cex, col = "black")
}
par(op)
}
##
## ### Week: wk1
##
## ### Week: wk12
##
## ### Week: wk14
##
## ### Week: wk16
suppressMessages(library(igraph))
suppressMessages(library(stringr))
# Duplicate definition of rescale01 (also defined earlier in the file),
# kept so this chunk can run stand-alone.
rescale01 <- function(x, to = c(0.6, 3.8)) {
r <- range(x, na.rm = TRUE)
if (!is.finite(r[1]) || !is.finite(r[2]) || r[2] - r[1] == 0) {
return(rep(mean(to), length(x)))
}
(x - r[1]) / (r[2] - r[1]) * (to[2] - to[1]) + to[1]
}
# One co-occurrence network per Diet+Supplement group, using the all-weeks
# LASSO-selected taxa and the same EBIC neighborhood-selection procedure.
groups <- unique(combined_clr$Group)
for (gname in groups) {
cat("\n### Group:", gname, "\n")
df_g <- combined_clr %>% dplyr::filter(Group == gname)
# Prefer the all-weeks selection; fall back to the lasso table if present.
sel_names <- NULL
if (exists("selected_taxa")) sel_names <- selected_taxa
if (is.null(sel_names) && exists("lasso_tbl")) sel_names <- unique(lasso_tbl$Taxon)
if (is.null(sel_names) || length(sel_names) <= 1) {
cat("No significant taxa for this group; skipped.\n")
next
}
X <- as.matrix(df_g[, sel_names, drop = FALSE])
colnames(X) <- sel_names
n <- nrow(X); p <- ncol(X)
# EBIC = BIC + 2*gamma*df*log(p) model-size penalty (redefined per chunk).
ebic <- function(rss, df, n, p, gamma = 0.5) {
n*log(rss/n) + df*log(n) + 2*gamma*df*log(p)
}
gamma <- 0.5
# Neighborhood selection: LASSO-regress each taxon on the others.
B <- matrix(0, p, p, dimnames = list(colnames(X), colnames(X)))
for (i in seq_len(p)) {
y <- X[, i]
Xo <- X[, -i, drop = FALSE]
fit <- glmnet(x = Xo, y = y, alpha = 1, family = "gaussian", standardize = TRUE)
Yhat <- predict(fit, Xo)
rss <- colSums((y - Yhat)^2)
k <- which.min(ebic(rss, fit$df, n, p - 1, gamma = gamma))
co <- as.numeric(coef(fit, s = fit$lambda[k]))[-1]
B[i, -i] <- co
}
# Symmetrize and build the graph from the non-zero pattern.
W <- (B + t(B)) / 2
diag(W) <- 0
adj <- (W != 0) * 1
g <- graph_from_adjacency_matrix(adj, mode = "undirected", diag = FALSE)
# Edge weights from W, in the graph's edge order.
get_w <- function(W, g) {
el <- as_edgelist(g, names = FALSE)
apply(el, 1, function(ix) W[ix[1], ix[2]])
}
w <- get_w(W, g)
# Edge width = |coefficient|; color = sign (blue +, red -).
e_width <- rescale01(abs(w), to = c(0.6, 4.0))
e_color <- ifelse(w >= 0, "steelblue", "firebrick")
e_curve <- 0.15
node_cols <- grDevices::rainbow(vcount(g), s = 0.9, v = 0.9)
coords <- layout_in_circle(g)
theta <- atan2(coords[,2], coords[,1])
# Short labels: species token, else genus, else last "_" chunk.
short_names <- sapply(V(g)$name, function(x){
sp <- str_extract(x, "(?<=\\.s__)[^\\._]+")
if (!is.na(sp) && nzchar(sp)) return(sp)
ge <- str_extract(x, "(?<=\\.g__)[^\\._]+")
if (!is.na(ge) && nzchar(ge)) return(ge)
tail(strsplit(x, "_", fixed = TRUE)[[1]], 1)
})
# Flip left-half labels so they read upright (see per-week chunk note).
flip <- (theta * 180 / pi > 90 & theta * 180 / pi < 270)
ang_deg <- ifelse(flip, theta*180/pi - 180, theta*180/pi)
hjust <- ifelse(flip, 1, 0)
# tau = 0: keep every non-zero edge.
tau <- 0
keep_e <- which(abs(w) >= tau)
g_tau <- subgraph.edges(g, eids = keep_e, delete.vertices = FALSE)
w_tau <- get_w(W, g_tau)
enlarge <- 1.30
lab_push <- 1.08
label_cex <- 0.55
layout_scaled <- coords * enlarge
lab_xy_scaled <- cbind(coords[,1]*enlarge*lab_push, coords[,2]*enlarge*lab_push)
lim <- 1.02 * enlarge * lab_push
op <- par(mar = c(3,3,5,3), xpd = NA, xaxs = "i", yaxs = "i")
plot(
g_tau,
layout = layout_scaled,
vertex.size = 4.2,
vertex.label = NA,
vertex.color = node_cols,
vertex.frame.color = NA,
edge.width = rescale01(abs(w_tau), to = c(0.6, 4.0)),
edge.color = ifelse(w_tau >= 0, "steelblue", "firebrick"),
edge.curved = e_curve,
rescale = FALSE,
xlim = c(-lim, lim), ylim = c(-lim, lim),
asp = 1
)
title(sprintf("Group: %s Taxa Co-occurrence |coef| ≥ %.2f", gname, tau), line = 3)
# Rotated vertex labels placed just outside the circle.
for (i in seq_len(vcount(g_tau))) {
text(lab_xy_scaled[i,1], lab_xy_scaled[i,2], short_names[i],
srt = ang_deg[i], adj = hjust[i],
cex = label_cex, col = "black")
}
par(op)
}
##
## ### Group: Lean+Cellulose
##
## ### Group: HighFat+Cellulose
##
## ### Group: Lean+Oligofructose
##
## ### Group: HighFat+Oligofructose
suppressMessages(library(dplyr))
suppressMessages(library(stringr))
# All-samples network: reuse the all-weeks LASSO-selected taxa.
sel_names <- NULL
if (exists("selected_taxa")) sel_names <- selected_taxa
if (is.null(sel_names) && exists("lasso_tbl")) sel_names <- unique(lasso_tbl$Taxon)
stopifnot("no available taxa" = length(sel_names) > 1)
message("Using ", length(sel_names), " taxa")
X <- as.matrix(combined_clr[, sel_names, drop = FALSE])
colnames(X) <- sel_names
n <- nrow(X); p <- ncol(X)
# EBIC = BIC + 2*gamma*df*log(p) model-size penalty (redefined per chunk).
ebic <- function(rss, df, n, p, gamma = 0.5) n*log(rss/n) + df*log(n) + 2*gamma*df*log(p)
gamma <- 0.5
set.seed(1)
# Neighborhood selection: LASSO-regress each taxon on the others, choosing
# lambda on the path by EBIC; B[i, j] = coefficient of j for response i.
B <- matrix(0, p, p, dimnames = list(colnames(X), colnames(X)))
for (i in seq_len(p)) {
y <- X[, i]
Xo <- X[, -i, drop = FALSE]
fit <- glmnet(x = Xo, y = y, alpha = 1, family = "gaussian", standardize = TRUE)
Yhat <- predict(fit, Xo)
rss <- colSums((y - Yhat)^2)
k <- which.min(ebic(rss, fit$df, n, p - 1, gamma = gamma))
co <- as.numeric(coef(fit, s = fit$lambda[k]))[-1]
B[i, -i] <- co
}
# Symmetrized weights; list each undirected edge once, sorted by |coef|.
W <- (B + t(B)) / 2
diag(W) <- 0
idx <- which(W != 0, arr.ind = TRUE)
edges_df <- data.frame(
from = colnames(W)[idx[,1]],
to = colnames(W)[idx[,2]],
coef = W[idx],
stringsAsFactors = FALSE
)
# Keep one direction per edge (from < to lexicographically) to de-duplicate.
edges_df <- edges_df[edges_df$from < edges_df$to, , drop = FALSE]
edges_df <- edges_df[order(-abs(edges_df$coef)), , drop = FALSE]
#Taxa: 16 #Non-zero edges: 42
library(igraph)
library(stringr)
# Another duplicate definition of rescale01 so this chunk is stand-alone.
rescale01 <- function(x, to = c(0.6, 3.8)) {
r <- range(x, na.rm = TRUE)
if (!is.finite(r[1]) || !is.finite(r[2]) || r[2] - r[1] == 0) {
return(rep(mean(to), length(x)))
}
(x - r[1]) / (r[2] - r[1]) * (to[2] - to[1]) + to[1]
}
stopifnot(exists("W"), is.matrix(W), !is.null(colnames(W)), nrow(W) == ncol(W))
# Graph from the non-zero pattern of the symmetrized weight matrix W.
adj <- (W != 0) * 1
g <- graph_from_adjacency_matrix(adj, mode = "undirected", diag = FALSE)
# Edge weights from W, in the graph's edge order.
get_w <- function(W, g) {
el <- as_edgelist(g, names = FALSE)
apply(el, 1, function(ix) W[ix[1], ix[2]])
}
w <- get_w(W, g)
e_width <- rescale01(abs(w), to = c(0.6, 4.0)) # higher correlation if wider
e_color <- ifelse(w >= 0, "steelblue", "firebrick") # positive: blue, negative: red
e_curve <- 0.15
set.seed(1)
node_cols <- grDevices::rainbow(vcount(g), s = 0.9, v = 0.9)
# Circular layout; theta drives label rotation around the circle.
coords <- layout_in_circle(g)
theta <- atan2(coords[,2], coords[,1])
# Short labels: species token after ".s__", else genus after ".g__",
# else the last "_"-separated chunk of the name.
short_names <- sapply(V(g)$name, function(x){
sp <- str_extract(x, "(?<=\\.s__)[^\\._]+")
if (!is.na(sp) && nzchar(sp)) return(sp)
ge <- str_extract(x, "(?<=\\.g__)[^\\._]+")
if (!is.na(ge) && nzchar(ge)) return(ge)
tail(strsplit(x, "_", fixed = TRUE)[[1]], 1)
})
lab_r <- 1.12
lab_xy <- cbind(coords[,1]*lab_r, coords[,2]*lab_r)
# Flip left-half labels so they read upright.
# NOTE(review): atan2 yields (-180, 180], so angles in (-180, -90) are
# never flipped by this test — confirm intended.
ang_deg_raw <- theta * 180 / pi
flip <- (ang_deg_raw > 90 & ang_deg_raw < 270)
ang_deg <- ifelse(flip, ang_deg_raw - 180, ang_deg_raw)
hjust <- ifelse(flip, 1, 0)
enlarge <- 1.30
lab_push <- 1.08
label_cex <- 0.55
layout_scaled <- coords * enlarge
lab_xy_scaled <- cbind(coords[,1]*enlarge*lab_push, coords[,2]*enlarge*lab_push)
lim <- 1.02 * enlarge * lab_push
op <- par(mar = c(3,3,5,3), xpd = NA, xaxs = "i", yaxs = "i")
set.seed(123)
plot(
g,
layout = layout_scaled,
vertex.size = 4.2,
vertex.label = NA,
vertex.color = node_cols,
vertex.frame.color = NA,
edge.width = e_width,
edge.color = e_color,
edge.curved = e_curve,
rescale = FALSE,
xlim = c(-lim, lim), ylim = c(-lim, lim),
asp = 1
)
title(main = "Taxa Co-occurrence across all weeks with |coef| ≥ 0", line = 4)
# Rotated vertex labels placed just outside the circle.
for (i in seq_len(vcount(g))) {
text(
x = lab_xy_scaled[i, 1],
y = lab_xy_scaled[i, 2],
labels = short_names[i],
srt = ang_deg[i],
adj = hjust[i],
cex = label_cex,
col = "black"
)
}
par(op)
# Draw a circular co-occurrence network for a (possibly edge-thresholded)
# graph, with edge width = |coefficient| and color = sign.
# NOTE(review): relies on globals from the chunk above — rescale01, coords,
# node_cols, e_curve, short_names, ang_deg, hjust — so it only works for
# graphs with the same vertex set and order as `g`.
plot_circle_net <- function(g_obj, w_vec, title_txt,
enlarge = 1.30, lab_push = 1.08,
label_cex = 0.55, top_mar = 5) {
e_width <- rescale01(abs(w_vec), to = c(0.6, 4.0))
e_color <- ifelse(w_vec >= 0, "steelblue", "firebrick")
layout_scaled <- coords * enlarge
lab_xy_scaled <- cbind(coords[,1]*enlarge*lab_push, coords[,2]*enlarge*lab_push)
lim <- 1.02 * enlarge * lab_push
op <- par(mar = c(3,3,top_mar,3), xpd = NA, xaxs = "i", yaxs = "i")
plot(
g_obj,
layout = layout_scaled,
vertex.size = 4.2,
vertex.label = NA,
vertex.color = node_cols,
vertex.frame.color = NA,
edge.width = e_width,
edge.color = e_color,
edge.curved = e_curve,
rescale = FALSE, # rescale = FALSE so the manual enlargement takes effect
xlim = c(-lim, lim), ylim = c(-lim, lim),
asp = 1
)
title(main = title_txt, line = 2)
# Rotated vertex labels placed just outside the circle.
for (i in seq_len(vcount(g_obj))) {
text(lab_xy_scaled[i,1], lab_xy_scaled[i,2], short_names[i],
srt = ang_deg[i], adj = hjust[i],
cex = label_cex, col = "black")
}
par(op)
}
# Re-draw the all-weeks network keeping only edges with |coef| >= 0.1.
tau <- 0.1
keep_e <- which(abs(w) >= tau)
g_tau <- subgraph.edges(g, eids = keep_e, delete.vertices = FALSE)
w_tau <- get_w(W, g_tau)
plot_circle_net(
g_obj = g_tau,
w_vec = w_tau,
title_txt = "",
enlarge = 1.30,
lab_push = 1.08,
label_cex = 0.55,
top_mar = 6
)
# Title added separately because title_txt was left empty above.
title(sprintf("Taxa Co-occurrence across all weeks with |coef| ≥ %.2f", tau), line = 2.5)